Used dataset and DILS parameters

### load already extracted output
dilsOut <- readRDS(paste0(folder_path, "/dilsOut_withoutWeirds.rds"))

analyses <- read.delim(paste0(folder_path, "/analysis_SpLn.txt")) %>%
    dplyr::rename(analysisType = analysis, analysis = analysis_pair) %>%
    left_join(count(dilsOut, analysis)) %>%
    rename(nReps = n) %>%
    rownames_to_column("order")

set <- analyses %>%
    pivot_longer(sp_species1:sp_species2) %>%
    pull(value) %>%
    unique()

data <- read.delim(paste0(folder_path, "samples_lineages.txt"))

knitr::kable(data, format = "html") %>%
    kable_styling(fixed_thead = T, font_size = 10) %>%
    scroll_box(width = "700px", height = "300px")
sample species caste mitochondrial datatype_nuc Lineage
I37-MSTRQP1 structor queen ibericus1 RNA-seq ibericus1
I38-MSTRQP1 structor queen ibericus1 RNA-seq ibericus1
I39-MSTRQP1 structor queen ibericus1 RNA-seq ibericus1
I53-MSTRQN1 structor queen ibericus1 RNA-seq ibericus1
I54-MSTRQN1 structor queen ibericus1 RNA-seq ibericus1
Mibe structor queen ibericus1 WGS ibericus1
NYLS4Q1 structor queen ibericus1 RNA-seq ibericus1
NYLS5Q1 structor queen ibericus1 RNA-seq ibericus1
PASS5Q1 structor queen ibericus1 RNA-seq ibericus1
PASS6M1 structor male ibericus1 RNA-seq ibericus1
PASS6Q1 structor queen ibericus1 RNA-seq ibericus1
PASS8Q1 structor queen ibericus1 RNA-seq ibericus1
POLL1Q1 structor queen ibericus1 RNA-seq ibericus1
RFOU2M structor male ibericus1 WGS ibericus1
RINSAM structor male ibericus1 WGS ibericus1
RINSAQ structor queen ibericus1 WGS ibericus1
SRR4292927 structor queen ibericus1 RNA-seq ibericus1
SRR4292928 structor queen ibericus1 RNA-seq ibericus1
SRR4292930 structor queen ibericus1 RNA-seq ibericus1
I31-MAEGW aegyptiacus worker mMebe3 RNA-seq Maeg
Maeg aegyptiacus queen mMebe3 WGS Maeg
SRR4292920 barbarus queen mMbar1 RNA-seq Mbar1
SRR4292931 barbarus male mMbar1 RNA-seq Mbar1
SRR4292935 barbarus queen mMbar1 RNA-seq Mbar1
SRR4292902 barbarus queen mMbar2 RNA-seq Mbar2
SRR4292903 barbarus queen mMbar2 RNA-seq Mbar2
SRR4292909 barbarus queen mMbar2 RNA-seq Mbar2
SRR4292936 barbarus male mMbar2 RNA-seq Mbar2
SRR4292908 capitatus queen capitatus RNA-seq Mcap
SRR4292910 capitatus queen capitatus RNA-seq Mcap
I15-MEBEQ13 ebeninus queen mMebe1 RNA-seq Mebe
I16-MEBEQ9 ebeninus queen mMebe2 RNA-seq Mebe
I17-MEBEQ23 ebeninus queen mMebe2 RNA-seq Mebe
I18-MEBEQ5 ebeninus queen mMebe2 RNA-seq Mebe
I19-MEBEQ19 ebeninus queen mMebe1 RNA-seq Mebe
I20-MEBEM3 ebeninus male mMebe1 RNA-seq Mebe
SRR4292914 ebeninus queen mMebe1 RNA-seq Mebe
Mtar structor queen muticus6 WGS muticus6
Y15472-1 structor queen muticus6 WGS muticus6
LCR4Q wasmanni queen wasmanni WGS Mwas
SRR4292932 wasmanni queen wasmanni RNA-seq Mwas
RDNIPQ structor queen ponticus2 WGS ponticus2
RPODGQ structor queen ibericus1 WGS ponticus2
SRR4292916 structor queen ponticus2 RNA-seq ponticus2
SRR4292917 structor queen ponticus2 RNA-seq ponticus2
M8914AQ structor queen structor3 WGS structor3
M893AQ structor queen structor3 WGS structor3
SRR4292926 structor queen structor3 RNA-seq structor3
M883AQ structor queen structor4 WGS structor4
M897AQ structor queen structor4 WGS structor4
samplesData <- data %>%
    group_by(Lineage) %>%
    tally() %>%
    left_join(pivot_longer(data = analyses, cols = sp_species1:sp_species2, values_to = "Lineage"),
        by = "Lineage") %>%
    select(-Lineage) %>%
    pivot_wider(names_from = name, names_sep = "nSamples_{name}", values_from = n)

dilsOut <- left_join(dilsOut, samplesData) %>%
    mutate(analysisType = case_when(analysis %in% "Mbar_Mstr" ~ "species", TRUE ~
        analysisType))
parameters <- dilsOut %>%
    select("/config.yaml") %>%
    unnest(cols = c("/config.yaml")) %>%
    filter(!V1 %in% c("infile:", "region:", "nspecies:", "nameA:", "nameB:", "nameOutgroup:",
        "config_yaml:", "timeStamp:")) %>%
    distinct()

knitr::kable(parameters, col.names = c("Parameters", "Values"), format = "html")
Parameters Values
useSFS: 1
lightMode: FALSE
population_growth: variable
modeBarrier: bimodal
max_N_tolerated: 0.1
Lmin: 30
nMin: 2
mu: 3e-09
rho_over_theta: 0.1
N_min: 100
N_max: 1200000
Tsplit_min: 100
Tsplit_max: 10000000
M_min: 0.4
M_max: 20


  • Should rerun analysis without RPODGQ ponticus (nuclear more like ponticus while mtDNA more like ibericus1).

  • Should also rerun without weird SRR4292926 structor3 sample (closer to structor3 workers than other two structor3 queens). Not sure if it is a potential structorM like sample or because this is RNAseq, but for other lineages with different type of data the localization in the tree is not so weird as this sample.

  • Not enough queens to include mcarthuri7. Made some tests comparing ibericus1 and ponticus2 to including structor3+structor4+structor6 as well as the different combinations of two of these lineages.


gof <- dilsOut %>%
  select("analysis", "species1", "species2", "rep",
         "/gof_2/goodness_of_fit_test.txt") %>%  ##expectations based on optimized posterior
  group_by(analysis, species1, species2, rep) %>%
  unnest(cols = c(`/gof_2/goodness_of_fit_test.txt`)) %>%
  select(analysis, stats, pvals_fdr_corrected) %>%
  filter(stats %in% c("FST_avg", "FST_std", "netdivAB_avg", "netdivAB_std", 
                      "piA_avg", "piA_std", "piB_avg", "piB_std")) %>%
  group_by(analysis, stats) %>%
  add_count(sign.pvalue = (pvals_fdr_corrected < 0.05)) %>%
  left_join(analyses[,c('analysis','order')]) %>%
  arrange(order)

bad <- unique(gof[gof$pvals_fdr_corrected < 0.05, c('rep','analysis')]) %>%
  mutate(repanalysis = paste0(rep,analysis))

dilsOutF <- dilsOut %>%
  mutate(repanalysis = paste0(rep,analysis)) %>%
  filter(!repanalysis %in% bad$repanalysis) %>%
  select(-repanalysis)

# filter the analyses which at least one stat had a significant p-value

#### Get best model ####
bestModel <- dilsOutF %>%
  select("analysis", "analysisType","species1", "species2", "rep",
         '/modelComp/hierarchical_models.txt') %>%
  group_by(analysis, species1, species2, rep) %>%
  unnest(cols = c(`/modelComp/hierarchical_models.txt`)) %>%
  mutate(rows = c("bestModel","bestModelProb")) %>%
  pivot_wider(values_from = contains("versus"),
              names_from = rows) 

bestModelw <- bestModel %>%
  pivot_longer(cols = !c(analysis,analysisType,species1,species2, rep),
               names_to = 'modelTest') %>%
  mutate(bestModel = word(modelTest,2, sep = "_"),
         modelTest = word(modelTest,1, sep = "_")) %>%
  pivot_wider(names_from = bestModel, values_from = value) %>%
  drop_na(bestModel) 

bestModelw$modelTest <- factor(bestModelw$modelTest,
                               levels = c("migration versus isolation", 
                                          "AM versus SI", "IM versus SC",
                                          "N-homo versus N-hetero",
                                          "M-homo versus M-hetero"))

colModels <- data.frame(models = c("isolation", "migration","SI", "AM", "IM", "SC","Nhomo","Nhetero","Mhomo","Mhetero"),
                        cols = c("#5D8937","#95548D","#435F47","#99A565","#7134D3", "#C384C2","#A0A3BC", "#89CEAE", "#7E90C4", "#CD5CBD"))

dilsOutFBest <- dilsOut %>%
  mutate(repanalysis = paste0(rep,analysis)) %>%
  filter(!repanalysis %in% bad$repanalysis) %>%
  select(-repanalysis)

bestReps <- dilsOutFBest %>%
  select("analysis","analysisType", "species1", "species2", "rep",
         '/modelComp/hierarchical_models.txt') %>%
  group_by(analysis, analysisType, species1, species2, rep) %>%
  unnest(cols = c(`/modelComp/hierarchical_models.txt`)) %>%
  mutate(rows = c("bestModel","bestModelProb")) %>%
  filter(rows %in% 'bestModelProb') %>%
  select(analysis, analysisType, rep, `migration versus isolation`) %>%
  group_by(analysis) %>%
  filter(`migration versus isolation` == max(`migration versus isolation`)) %>%
  mutate(toFilter = paste0(analysis,rep))

Data summary stats

sumStats <- dilsOut %>%
    select(analysis, "rep", `/ABCstat_global.txt`) %>%
    unnest(cols = c("/ABCstat_global.txt")) %>%
    select(analysis, rep, piA_avg, piB_avg, thetaA_avg, thetaB_avg, netdivAB_avg,
        divAB_avg)

### Diversity
pis <- sumStats %>%
    pivot_longer(cols = piA_avg:thetaB_avg, names_to = "stats") %>%
    mutate(lineage = case_when(str_detect(stats, "A_avg") ~ word(analysis, sep = "_",
        1), str_detect(stats, "B_avg") ~ word(analysis, sep = "_", 2)), stats = case_when(str_detect(stats,
        "pi") ~ "pi_avg", str_detect(stats, "theta") ~ "theta_avg")) %>%
    distinct() %>%
    mutate(toFilter = paste0(analysis, rep)) %>%
    filter(toFilter %in% bestReps$toFilter) %>%
    left_join(analyses[, c("analysis", "order")]) %>%
    arrange(as.numeric(order))

p <- ggplot(pis) + geom_point(aes(x = value, y = fct_rev(fct_inorder(lineage)), color = stats),
    position = position_dodge(width = 0.5), alpha = 0.5) + labs(x = "Genetic diversity",
    y = "Lineages") + theme_bw() + theme(legend.position = "right") + scale_color_manual(values = iwanthue(10)[1:2])

#### Get divergences ####
dis <- sumStats %>%
    select(analysis, rep, netdivAB_avg, divAB_avg) %>%
    rename(dxy_avg = divAB_avg, da_avg = netdivAB_avg) %>%
    pivot_longer(cols = da_avg:dxy_avg, names_to = "stats") %>%
    mutate(toFilter = paste0(analysis, rep)) %>%
    filter(toFilter %in% bestReps$toFilter) %>%
    left_join(analyses[, c("analysis", "order")]) %>%
    arrange(as.numeric(order))

dxyda <- ggplot(dis) + geom_point(aes(y = fct_rev(fct_inorder(analysis)), x = value,
    color = stats, group = stats), position = position_dodge(width = 0.5), alpha = 0.5) +
    labs(x = "Divergences", y = "Lineages comparison") + theme_bw() + theme(legend.position = "right") +
    scale_color_manual(values = iwanthue(10)[3:4])

ggarrange(p, dxyda, nrow = 2, heights = c(0.8, 1), align = "v")


Per step posterior probabilities

bestModelwProb <- bestModelw %>%
    mutate(toFilter = paste0(analysis, rep)) %>%
    filter(toFilter %in% bestReps$toFilter) %>%
    mutate(steps = case_when(modelTest %in% "migration versus isolation" ~ "Step 1",
        modelTest %in% c("AM versus SI", "IM versus SC") ~ "Step 2", modelTest %in%
            c("N-homo versus N-hetero", "M-homo versus M-hetero") ~ "Step 3"), bestModel = factor(bestModel,
        levels = c("isolation", "migration", "AM", "IM", "SC", "Nhomo", "Nhetero",
            "Mhomo", "Mhetero")))
p <- ggplot(bestModelwProb, aes(y = as.numeric(bestModelProb), x = fct_rev(analysisType),
    color = bestModel)) + geom_point() + ggrepel::geom_text_repel(aes(label = analysis),
    box.padding = 0.2, size = 2, show.legend = FALSE) + facet_wrap(~steps) + labs(y = "Posterior probability",
    x = "") + theme_bw() + scale_color_iwanthue() + theme(panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())

# scale_color_iwanthue(name = 'Best Model', labels = c('Isolation',
# 'Migration', 'AM', 'IM', 'SC', 'N-homo','N-hetero','M-homo','M-hetero')) +
# scale_shape_manual(values = c(1, 2, 3)) +

p


Probability of migration

guide_legend2 <- function(...) ggplot2::guide_legend(...)  ##rmarkdown conflict with guide_legend

ggplot(filter(bestModelwProb, !steps %in% "Step 3"), aes(y = as.numeric(bestModelProb),
    x = fct_rev(analysisType), color = bestModel, shape = steps)) + geom_point() +
    ggrepel::geom_text_repel(aes(label = analysis), size = 2, show.legend = FALSE) +
    facet_wrap(~steps) + labs(y = "Posterior probability", x = "") + theme_bw() +
    scale_color_iwanthue(name = "Best Model", labels = c("Isolation", "Migration",
        "AM", "IM", "SC")) + scale_shape_manual(values = c(1, 2)) + theme(panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()) + guides(shape = "none", color = guide_legend2(override.aes = list(shape = c(1,
    1, 2, 2, 2))))
step23 <- select(bestModelwProb, analysis, bestModel, steps) %>%
    filter(steps %in% c("Step 2", "Step 3")) %>%
    group_by(analysis) %>%
    summarise(bestModel23 = paste0(bestModel, collapse = "_"))

bestModelDxyDa <- bestModelw %>%
    filter(modelTest %in% "migration versus isolation") %>%
    mutate(problMigration = case_when(bestModel %in% "isolation" ~ 1 - as.numeric(bestModelProb),
        TRUE ~ as.numeric(bestModelProb)), toFilter = paste0(analysis, rep)) %>%
    filter(toFilter %in% bestReps$toFilter) %>%
    select(-modelTest, -bestModelProb) %>%
    distinct() %>%
    left_join(select(sumStats, analysis, rep, netdivAB_avg, divAB_avg)) %>%
    left_join(analyses[, c("analysis", "order")]) %>%
    left_join(step23) %>%
    arrange(as.numeric(order))

p <- ggplot(bestModelDxyDa, aes(x = divAB_avg, y = problMigration, shape = bestModel23,
    color = analysisType)) + geom_point(alpha = 0.7) + ggrepel::geom_text_repel(aes(label = analysis),
    box.padding = 0.3, segment.size = 0.3, size = 3, show.legend = FALSE) + scale_color_iwanthue() +
    scale_shape_manual(values = 0:6) + labs(y = "Probability of migration", x = "Dxy",
    shape = "Best models", color = "Analyses type") + theme_bw() + theme(panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()) + lims(x = c(0, 0.04), y = c(0, 1))
p

ggsave(paste0(folder_path, "probabilityMigration.pdf"), p, width = 9, height = 6)
p2 <- ggplot(filter(bestModelDxyDa, str_detect(analysis, "pon2|str3")), aes(x = divAB_avg,
    y = problMigration, shape = bestModel23)) + geom_point(alpha = 0.7) + ggrepel::geom_text_repel(aes(label = analysis),
    box.padding = 0.3, segment.size = 0.3, size = 3, show.legend = FALSE) + scale_shape_manual(values = 0:6) +
    labs(y = "Probability of migration", x = "Dxy", shape = "Best models") + theme_bw() +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    lims(x = c(0, 0.04), y = c(0, 1))
p2

ggsave(paste0(folder_path, "probabilityMigration_weirdsWithout.pdf"), p, width = 9,
    height = 6)

Goodness of fit - PCA on summary statistics

Within species analyses

PCA <- dilsOut %>%
    select("analysis", "species1", "species2", "analysisType", "rep", "/table_coord_PCA_SS.txt") %>%
    unnest(cols = c("/table_coord_PCA_SS.txt")) %>%
    mutate(toFilter = paste0(analysis, rep)) %>%
    filter(toFilter %in% bestReps$toFilter)

eig <- dilsOut %>%
    select("analysis", "analysisType", "species1", "species2", "rep", "/table_eigenvalues_PCA_SS.txt") %>%
    unnest(cols = c("/table_eigenvalues_PCA_SS.txt")) %>%
    mutate(toFilter = paste0(analysis, rep)) %>%
    filter(toFilter %in% bestReps$toFilter, V1 %in% c("comp 1", "comp 2")) %>%
    select(-toFilter, -eigenvalue, -`cumulative percentage of variance`) %>%
    pivot_wider(names_from = V1, values_from = `percentage of variance`)

pls <- purrr::map(which(eig$analysisType %in% "lineages"), ~ggplot(filter(PCA, analysis %in%
    unique(PCA$analysis)[.x]), aes(x = Dim.1, y = Dim.2, color = origin)) + geom_vline(xintercept = 0) +
    geom_hline(yintercept = 0) + geom_point() + theme_bw() + geom_point(data = filter(PCA,
    analysis %in% unique(PCA$analysis)[.x], origin %in% "observed dataset"), aes(x = Dim.1,
    y = Dim.2), color = "#D7191C") + scale_colour_brewer(palette = "Spectral", name = "") +
    labs(title = analyses[analyses$analysis %in% unique(PCA$analysis)[.x], "analysis"],
        x = paste0("PC1 (", round(pull(eig[eig$analysis %in% unique(PCA$analysis)[.x],
            "comp 1"]), 2), "%)"), y = paste0("PC2 (", round(pull(eig[eig$analysis %in%
            unique(PCA$analysis)[.x], "comp 2"]), 2), "%)")) + theme(plot.title = element_text(size = 9)))

q <- ggarrange(plotlist = pls, common.legend = TRUE, nrow = 4, ncol = 4)
q


Between species analyses

pls <- purrr::map(which(eig$analysisType %in% "species"), ~ggplot(filter(PCA, analysis %in%
    unique(PCA$analysis)[.x]), aes(x = Dim.1, y = Dim.2, color = origin)) + geom_vline(xintercept = 0) +
    geom_hline(yintercept = 0) + geom_point() + theme_bw() + geom_point(data = filter(PCA,
    analysis %in% unique(PCA$analysis)[.x], origin %in% "observed dataset"), aes(x = Dim.1,
    y = Dim.2), color = "#D7191C") + scale_colour_brewer(palette = "Spectral", name = "") +
    labs(title = analyses[analyses$analysis %in% unique(PCA$analysis)[.x], "analysis"],
        x = paste0("PC1 (", round(pull(eig[eig$analysis %in% unique(PCA$analysis)[.x],
            "comp 1"]), 2), "%)"), y = paste0("PC2 (", round(pull(eig[eig$analysis %in%
            unique(PCA$analysis)[.x], "comp 2"]), 2), "%)")) + theme(plot.title = element_text(size = 9)))

q <- ggarrange(plotlist = pls, common.legend = TRUE, nrow = 2, ncol = 4)
q


Parameters

Model parameters definitions

Population sizes
       * Na: effective size of the ancestral population [# of diploid individuals]
       * N1; N2: effective size of population 1 (resp. 2) [# of diploid individuals]
        If Ne is genomically heterogeneous
               * shape_N_a; shape_N_b: shape parameter α (resp. β) of the Beta(α, β) distribution for Ne (shared by all populations)
        If there is a demographic change
               * Tdem1; Tdem2: time of the demographic change in population 1 (resp. 2) [# of generations]
               * founders1; founders2: number of founder individuals in population 1 (resp. 2) at the time of the demographic change Tdem1 (and Tdem2)
Times for two populations or more
       * Tsplit: time of split at which the ancestral population subdivides in two populations [# of generations]
        If there is both gene flow and isolation
               * Tsc: time of secondary contact at which the two populations start exchanging genes [# of generations]
               * Tam: time of ancient migration at which the two populations stop exchanging genes [# of generations]
Migration and barriers
       * M12; M21: introgression rate from population 2 to 1 (resp. from 1 to 2) [# of migrants per generation]
        If N.m is genomically heterogeneous (bimodal model)
               * nBarriersM12; nBarriersM21: number of loci inferred as interspecies barriers for introgression from population 2 to 1 (resp. from 1 to 2)
        If N.m is genomically heterogeneous (beta model)
               * shape_M12_a; shape_M12_b: shape parameter α (resp. β) of the Beta(α, β) distribution for N.m (from population 2 to 1)
               * shape_M21_a; shape_M21_b: shape parameter α (resp. β) of the Beta(α β) distribution for N.m (from population 1 to 2)

Figure

estPar <- dilsOut %>%
    select("analysis", "analysisType", "species1", "species2", "rep", "/best_model_5/posterior_summary_RandomForest_bestModel.txt") %>%
    group_by(analysis) %>%
    unnest(cols = c(`/best_model_5/posterior_summary_RandomForest_bestModel.txt`)) %>%
    left_join(., bestModel[c("analysis", "rep", "migration versus isolation_bestModel")],
        by = c("analysis", "rep")) %>%
    rename(step1 = "migration versus isolation_bestModel") %>%
    distinct() %>%
    left_join(analyses[, c("analysis", "order")]) %>%
    arrange(as.numeric(order)) %>%
    mutate(toFilter = paste0(analysis, rep)) %>%
    filter(toFilter %in% bestReps$toFilter)

estPar$param_f = factor(estPar$param, levels = c("Na", "N1", "N2", "founders1", "founders2",
    "Tsplit", "Tam", "Tsc", "Tdem1", "Tdem2", "M12", "M21", "shape_N_a", "shape_N_b",
    "nBarriersM12", "nBarriersM21"))

pars <- ggplot(estPar, aes(y = fct_rev(fct_inorder(analysis)), x = median, color = analysisType)) +
    geom_point(size = 1) + geom_linerange(aes(xmin = `HPD2.5%`, xmax = `HPD%97.5`)) +
    facet_wrap(~param_f, ncol = 4, scales = "free_x") + theme_bw() + labs(y = "Lineages comparison",
    x = "Estimated parameters", color = "Analyses type") + theme(legend.position = "bottom",
    axis.text.y = element_text(size = 5)) + scale_color_iwanthue()

pars